suppressPackageStartupMessages(
library(tidyverse)
)
library(leaflet)
Establish file path to the project directory. This will be used to access data.
# Absolute path to the RStudio project root; every data read below hangs off it.
project.path <- rprojroot::find_rstudio_root_file()
# Dated data snapshot (2019-04-30) used throughout this script.
data.path <- file.path(project.path, "data", "2019-04-30")
The clean_up() function is used to aid in the standardization of the data. The processes described below generally eliminate some of most common string features that can cause erroneous mismatching.
# clean_up(): standardize a data frame to reduce erroneous mismatches
# when tables from different sources are joined or compared.
#   * tolower() - column names and character values are lower-cased.
#   * trimws()  - leading/trailing whitespace is stripped from column
#                 names and character values.
#   * gsub()    - internal runs of two or more spaces collapse to one.
#
# @param x A data frame.
# @return The same data frame with standardized names and character columns.
clean_up <- function(x) {
  names(x) <- trimws(tolower(names(x)))
  # Transform only the character columns; other column types pass through.
  is.chr <- vapply(x, is.character, logical(1))
  x[is.chr] <- lapply(x[is.chr], function(col) {
    col <- trimws(tolower(col))
    # Remove instances of more than one space. The original pattern
    # "[' ']{2,}" was a character class (space OR apostrophe), which also
    # collapsed runs of apostrophes; " {2,}" matches spaces only, per the
    # stated intent.
    gsub(" {2,}", " ", col)
  })
  x
}
On 4/30/2019, A.J. Smith suggested that we drop stations sampled for HABs monitoring (i.e., “13-gunk-40.3”, “13-gunk-t35-0.2”, “13-gunk-37.7”, and “13-lgun-6.0”).
# Stations sampled for HABs monitoring; dropped from all downstream tables
# per A.J. Smith (2019-04-30).
# NOTE(review): the prose above lists "13-gunk-t35-0.2" (hyphen) but this
# vector uses "13-gunk_t35-0.2" (underscore) -- confirm which form appears
# in the raw data, otherwise that station is never filtered out.
ex.station.vec <- c("13-gunk-40.3",
"13-gunk_t35-0.2",
"13-gunk-37.7",
"13-lgun-6.0")
Import Wallkill 2017 and 2018 station information and combine these into a single data frame. 2018 special study station “13-walk-0.8” is considered equivalent to “13-walk-0.7”, and therefore is relabeled as “13-walk-0.7”.
# 2017 Wallkill stations: read, standardize, and align the column names
# with those used by the 2018 station table.
stations17.df <- read.csv(
  file.path(data.path, "sites", "WallkillSites2017.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  rename(station = sbu.id, survey = ras)
# 2018 Wallkill stations: read, standardize, and align the column names
# with those used by the 2017 station table.
stations18.df <- read.csv(
  file.path(data.path, "sites", "WallkillSites2018.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  rename(station = site_id, stream = name) %>%
  # Special-study station 13-walk-0.8 is treated as equivalent to
  # 13-walk-0.7, so relabel it before the tables are combined.
  # NOTE(review): the "13-" basin prefix is pasted on only AFTER the
  # 2017/2018 tables are bound (see the combine step below) -- confirm
  # that site_id here already carries the prefix, otherwise this match
  # never fires.
  mutate(station = if_else(station %in% "13-walk-0.8", "13-walk-0.7", station))
# Combine both survey years, derive the location and river-mile fields
# from the station label, attach the "13-" basin prefix, and drop the
# HABs-monitoring stations.
stations.df <- bind_rows(stations17.df, stations18.df) %>%
  mutate(
    # Order matters inside this mutate(): rivmile and location parse the
    # unprefixed label, and only then is the basin prefix attached.
    rivmile = gsub(".*-", "", station),
    location = gsub("-.*", "", station),
    station = paste0("13-", station)
  ) %>%
  filter(!station %in% ex.station.vec)
Clear the global environment of unnecessary objects.
rm(stations17.df, stations18.df)
Import 2018 PEERS stations collected within the Wallkill basin.
# 2018 PEERS stations within the Wallkill basin: keep only the identifier
# and coordinates, and tag the rows so they can be distinguished later.
stations_peers.df <- read.csv(
  file.path(data.path, "peers", "peers_stations.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  select(station, latitude, longitude) %>%
  mutate(type = "peers")
Add the PEERS stations to the stations.df data frame.
stations.df <- bind_rows(stations.df, stations_peers.df)
Import USGS gage stations found within the Wallkill basin.
# USGS gage stations within the Wallkill basin, tagged by type.
gage.df <- read.csv(
  file.path(data.path, "gage", "GageSites.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  mutate(type = "gage")
Add the gaged stations to the stations.df data frame.
# Append the gage stations and flag them with a logical column.
# `%in%` already returns TRUE/FALSE (never NA), so the previous
# if_else(type %in% "gage", TRUE, FALSE) wrapper was redundant.
stations.df <- bind_rows(stations.df, gage.df) %>%
  mutate(gage = type %in% "gage")
Zachary Smith ordered the Wallkill 2017 and 2018 samples from upstream to downstream relative to the mainstem Wallkill.
# Upstream-to-downstream station ordering (Zachary Smith), with the
# HABs-monitoring stations removed to match the station table.
station_order.df <- read.csv(
  file.path(data.path, "finalized", "wallkill_station-order_up-to-down.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  filter(!station %in% ex.station.vec)
stations.df <- left_join(station_order.df, stations.df, by = "station")
Import Wallkill 2017 and 2018 BAP data and combine these into a single data frame.
# Import the 2017 and 2018 BAP files, combine them, and standardize with
# clean_up(). The date column is converted to Date class, samples are
# grouped by station and date, and the replicate count plus mean and
# median BAP scores are computed for each aggregate.
bap.df <- c("WALLKILL2017_BAP.csv",
            "WALLKILL2018_BAP.csv") %>%
  map_df(function(file.i) {
    file.path(data.path, "bap", file.i) %>%
      read.csv(stringsAsFactors = FALSE,
               na.strings = c("", " ", "NA"))
  }) %>%
  clean_up() %>%
  rename(station = location.station) %>%
  mutate(station = paste("13", station, sep = "-"),
         # Order river miles from upstream (largest) to downstream.
         rivmile = factor(rivmile, sort(unique(rivmile), decreasing = TRUE)),
         date = as.Date(date, "%m/%d/%Y")) %>%
  group_by(station, date) %>%
  mutate(replicates = n(),
         mean_bap = mean(bap, na.rm = TRUE),
         median_bap = median(bap, na.rm = TRUE)) %>%
  ungroup() %>%
  # The "13-" basin prefix was attached above, so the exclusion list must
  # use prefixed names; the original unprefixed trio ("qker-0.9", ...)
  # could never match and therefore filtered nothing.
  filter(!station %in% c("13-qker-0.9",
                         "13-walk-35.6",
                         "13-wklei-0.6"),
         !station %in% ex.station.vec)
The data is exported as a CSV and manually added to the Wallkill SharePoint “finalized” folder.
# Export the finalized BAP table; this CSV is manually uploaded to the
# Wallkill SharePoint "finalized" folder.
data.table::fwrite(bap.df,
file.path(data.path,
"finalized",
"wallkill_bap_2017-2018.csv"))
Import the Wallkill average chemistry data.
# Average chemistry: keep only the station ID and the total-phosphorus
# summaries (average and log10-transformed average).
tp.df <- read.csv(
  file.path(data.path, "chemistry", "Wallkill_AVG_chem.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA")
) %>%
  clean_up() %>%
  rename(station = sbuid) %>%
  select(station, tp_avg, tp_log10)
Import Wallkill 2017 and 2018 Special Studies and 2018 PEERS chemistry data. Perform general data standardization with the clean_up() function. The 2018 Special Study data was QAQCed by Gavin Lemley (2019-04-29) using an R script developed by Alene Onion and Gavin Lemley. However, the 2017 Special Study chemistry and the 2018 PEERS data have not been QAQCed.
# 2017 Special Study chemistry (not yet QAQCed; see note above).
chem2017.df <- read.csv(
  file.path(data.path, "chemistry", "2017_wallkill_chem_qaqcd-2019-04-29.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA"),
  # Force "fraction" to character so single-letter codes survive import.
  colClasses = c("fraction" = "character")
) %>%
  clean_up()
# 2018 Special Study chemistry (QAQCed 2019-04-29 by Gavin Lemley).
chem2018.df <- read.csv(
  file.path(data.path, "chemistry", "2018_wallkill_chem_qaqcd-2019-03-29.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA"),
  # Force "fraction" to character so single-letter codes survive import.
  colClasses = c("fraction" = "character")
) %>%
  clean_up() %>%
  mutate(project_name = "wallkill special study 2018")
# 2018 PEERS chemistry (not yet QAQCed; see note above).
peers2018.df <- read.csv(
  file.path(data.path, "peers", "PEERS.wallkill.output.site.names.fixed.csv"),
  stringsAsFactors = FALSE,
  na.strings = c("", " ", "NA"),
  # Force "fraction" to character so single-letter codes survive import.
  colClasses = c("fraction" = "character")
) %>%
  clean_up() %>%
  mutate(fraction = "t")
We will only retain the columns present in the QAQCed 2018 Special Study data. The script below identifies any columns present in the 2018 Special Study data but missing in the 2018 PEERS data. We will want to add these columns to the 2018 PEERS data, if possible.
names(chem2018.df)[!names(chem2018.df) %in% names(peers2018.df)]
## [1] "project_name" "siteid"
“project_name” was one of the columns which was present in the 2018 Special Study data but absent from the 2018 PEERS data. The script below adds the column “project_name” to the 2018 PEERS data frame. The column is populated with the character string “wallkill peers 2018”, which follows the labeling scheme found in the 2018 Special Study data (i.e., wallkill special study 2018).
peers2018.df$project_name <- "wallkill peers 2018"
“siteid” is present in the 2018 Special Study data but absent from the 2018 PEERS data. The script below adds the column “siteid” to the 2018 PEERS data frame. The column is populated by extracting the basin, location, and rivermile elements from “sys_sample_code”, which follows the same pattern found in the 2018 Special Study data.
The regex was difficult to solve. I found the answer to this Stack Overflow question to be helpful (https://stackoverflow.com/questions/25448921/regex-to-extract-all-characters-before-the-second-dash-hyphen). The description below is modified from the answer by Casimir et Hippolyte in the link above.
# Build siteid from sys_sample_code by taking everything up to (but not
# including) the third hyphen: basin, location, and river mile.
# "\\A(?:[^-]++-??){3}" anchors at the string start and consumes three
# hyphen-delimited tokens; the possessive "++" and lazy "-??" are ICU
# regex features, supported because stringr uses the ICU engine.
peers2018.df$siteid <- str_extract(peers2018.df$sys_sample_code,
"\\A(?:[^-]++-??){3}")
Now that the 2018 PEERS data contains all of the columns present in the 2018 RAS data, the 2018 PEERS data is subset to only include columns that are found in the 2018 RAS data.
peers2018.df <- peers2018.df[, names(peers2018.df) %in% names(chem2018.df)]
The Wallkill 2017 and 2018 Special Studies and 2018 PEERS chemistry data are combined into a single data frame.
# Combine the 2017/2018 Special Study and 2018 PEERS chemistry with
# bind_rows(), then reorder columns with select(). PEERS dates arrive as
# m/d/Y while Special Study dates are Y-m-d H:M:S, so each source is
# parsed with its own format before being unified into a character
# date_time column. (The original line had markdown prose fused onto it,
# which made the script unparseable.)
chem.df <- bind_rows(chem2017.df, chem2018.df, peers2018.df) %>%
  mutate(date_time = if_else(project_name == "wallkill peers 2018",
                             as.POSIXct(sample_date, "%m/%d/%Y", tz = "EST"),
                             as.POSIXct(sample_date, "%Y-%m-%d %H:%M:%S", tz = "EST")),
         date_time = as.character(date_time)) %>%
  rename(station = siteid) %>%
  select(project_name, sys_sample_code, station,
         date_time, everything(), -sample_date) %>%
  filter(!station %in% ex.station.vec) %>%
  # Relabel the special-study station to its equivalent, as was done for
  # the station table.
  mutate(station = if_else(station %in% "13-walk-0.8", "13-walk-0.7", station))
The data is exported as a CSV and manually added to the Wallkill SharePoint “finalized” folder.
# Export the finalized chemistry table; this CSV is manually uploaded to
# the Wallkill SharePoint "finalized" folder.
data.table::fwrite(chem.df,
file.path(data.path,
"finalized",
"wallkill_chem_ras-2017-2018_peers-2018.csv"))
Clear the global environment of unnecessary objects.
rm(chem2017.df, chem2018.df, peers2018.df)
Import the Wallkill NHD clip created by Charles Stoll as a SpatialLinesDataFrame.
# Path to the NHD geodatabase clipped to the Wallkill basin.
nhd.gdb <- file.path(project.path, "data", "gis", "Wallkill_Clip_NHD.gdb")
# Read the stream network layer as a SpatialLinesDataFrame.
# NOTE(review): rgdal was retired from CRAN in 2023; consider sf::st_read()
# if this script is ported to a current R installation.
streams.polyline <- rgdal::readOGR(dsn = nhd.gdb, layer = "WALK_0_7_NHD_CLIP")
## OGR data source with driver: OpenFileGDB
## Source: "C:\Users\zmsmith\OneDrive - New York State Office of Information Technology Services\project\nysdec\wallkill\data\gis\Wallkill_Clip_NHD.gdb", layer: "WALK_0_7_NHD_CLIP"
## with 9129 features
## It has 15 fields
## Warning in rgdal::readOGR(dsn = nhd.gdb, layer = "WALK_0_7_NHD_CLIP"): Z-
## dimension discarded
Create a function to standardize leaflet plots.
# Render a standardized interactive leaflet map of station locations.
#
# @param x       Data frame with station, stream, latitude, longitude,
#                type, and a logical `gage` column (drives the palette).
# @param streams Polyline layer drawn beneath the markers. Defaults to
#                the NHD clip loaded above; previously this was a hidden
#                global dependency, now an overridable parameter.
# @return A leaflet htmlwidget.
leaflet_plot <- function(x, streams = streams.polyline) {
  # Two-color palette over the distinct gage values (green / purple).
  pal <- colorFactor(c("#508b1a", "#551a8b"), domain = unique(x$gage))
  leaflet(x) %>%
    addTiles() %>%
    addPolylines(data = streams,
                 weight = 2,
                 opacity = 1) %>%
    addCircleMarkers(~longitude, ~latitude,
                     color = ~pal(gage),
                     stroke = FALSE, fillOpacity = 1.00,
                     # HTML popup summarizing the station attributes.
                     popup = paste(
                       paste("<b>Station:</b>", x$station),
                       paste("<b>Stream:</b>", x$stream),
                       paste("<b>Longitude:</b>", x$longitude),
                       paste("<b>Latitude:</b>", x$latitude),
                       paste("<b>Type:</b>", x$type),
                       sep = "<br/>"
                     ))
}
Plot the Wallkill locations on an interactive map.
leaflet_plot(stations.df)